#################################################################################
# Replication file for:                                                         #
# "Balancing Precision and Retention in Experimental Design"                    #
#                                                                               #
# Gustavo Diaz                                                                  #
# Northwestern University                                                       #
# gustavo.diaz@northwestern.edu                                                 #
#                                                                               #
# Erin L. Rossiter                                                              #
# University of Notre Dame                                                      #
# erossite@nd.edu                                                               #
#                                                                               #
# This file conducts the block randomization for the Tappin and Hewitt          #
# replication, saving output for use in the Wave 2 Qualtrics survey.            #
#################################################################################

# Data -----

# Note: this reads the merged data, although only the Wave 1 variables had
# been collected when the script was originally run. The original script
# also identified participants by their Connect Identifiers; this version
# was adapted to use the anonymous id created for the replication archive.
# NOTE(review): no library() calls appear in this file -- the tidyverse
# verbs and blockTools functions used below are presumably attached by the
# calling session; confirm.
df <- readRDS("./data/raw_data/TappinHewittReplication.rds")

# Exclusions -----

## Keep only respondents who finished Wave 1 (finished_w1 == 1) and who
## passed the attention check (attention_check == 12). Rows where either
## comparison is NA are dropped as well.
df <- df %>%
  filter(finished_w1 == 1, attention_check == 12)

# Clean variables for blocking -----

# All derived columns are created in one mutate(); none of them depends on
# another derived column.
df <- df %>%
  mutate(
    # gender: one logical indicator per response code
    gender_m = gender == 1,
    gender_w = gender == 2,
    gender_nb = gender == 3,

    # race: an indicator is TRUE when the code's digit appears anywhere in
    # `race` (substring match via grepl)
    race_white = grepl("1", race),
    race_black_africanam = grepl("2", race),
    race_amindian_aknative = grepl("3", race),
    race_asian = grepl("4", race),
    race_nativehawaiian_pi = grepl("5", race),
    race_hispanic_latino = grepl("6", race),
    race_arab_middleeastern = grepl("7", race),
    race_other = grepl("8", race),

    # three-category party id that counts leaners as partisans; rows that
    # match none of the rules are left NA
    pid3 = case_when(
      pid == 1 | pid_lean == 2 ~ "D",
      pid == 2 | pid_lean == 1 ~ "R",
      (pid == 3 | pid == 4) & pid_lean == 3 ~ "I"
    ),

    # seven-point party id; rules are evaluated top to bottom
    pid7 = case_when(
      pid == 1 & pid_strength == 1 ~ 1, # strong Dem
      pid == 1 & pid_strength == 2 ~ 2, # weak Dem
      pid_lean == 2 ~ 3,                # lean Dem
      pid_lean == 3 ~ 4,                # pure Ind
      pid_lean == 1 ~ 5,                # lean Rep
      pid == 2 & pid_strength == 2 ~ 6, # weak Rep
      pid == 2 & pid_strength == 1 ~ 7  # strong Rep
    )
  )


# Names of the covariates used for blocking. The order matters downstream:
# the columns are later dropped with the range `age:healthcare_pre`, so
# "age" must stay first and "healthcare_pre" last.
block_covars <- c(
  # demographics (treating age and income as continuous)
  "age",
  "gender_m",
  # "gender_w" is omitted bc it repeats the info in gender_m
  # "gender_nb" is omitted bc no one is in design 3
  paste0(
    "race_",
    c("white", "black_africanam", "amindian_aknative", "asian",
      "nativehawaiian_pi", "hispanic_latino", "arab_middleeastern", "other")
  ),
  "inc",

  # pre-treatment political variables
  "pid7",
  "ideo",
  paste0("ft_elites_pre_", 1:4), # 1 Trump, 2 Obama, 3 Biden, 4 Romney FT
  paste0("ft_voters_pre_", 1:2), # 1 Republican voters, 2 Democratic voters

  # pre-treatment measures of the outcome (pension is the outcome of
  # interest)
  paste0(
    c("salestax", "pension", "fedaudit", "foreignaid", "healthcare"),
    "_pre"
  )
)

# Blocking data: design 3 respondents only, with the id column first and
# the blocking covariates after it
df_block <- df %>%
  filter(design == 3) %>%
  select(all_of(c("id", block_covars)))

# Check completeness
# 58 incompletes excluded *prior to* block randomized treatment
# (mentioned in Appendix Section E.3)
completes <- complete.cases(df_block)
table(completes)

# Keep only the rows with no missing value in any selected column
df_block <- df_block[completes, ]


# Create blocks -----

# The main outcome of interest (pension_pre) is neither weighted nor used
# to restrict the blocking, so that the optimal algorithm can be used
# (as opposed to optGreedy).

set.seed(1234)
out <- block(
  data = df_block,
  id.vars = "id",
  # every column except the first one (the id) is a blocking variable
  block.vars = colnames(df_block)[-1],
  n.tr = 2,
  distance = "mahalanobis",
  algorithm = "optimal"
)

# Assign treatment -----

## One independent within-block assignment per issue, each with its own
## seed, for whether the respondent is shown a party cue on that issue
cue_salestax <- assignment(out, namesCol = c("cue", "nocue"), seed = 1)
cue_pension <- assignment(out, namesCol = c("cue", "nocue"), seed = 2)
cue_fedaudit <- assignment(out, namesCol = c("cue", "nocue"), seed = 3)
cue_foreignaid <- assignment(out, namesCol = c("cue", "nocue"), seed = 4)
cue_healthcare <- assignment(out, namesCol = c("cue", "nocue"), seed = 5)

## A further within-block assignment for whether the respondent states
## their opinion right away or waits
state_pref <- assignment(out, namesCol = c("state", "wait"), seed = 6)

# Reorganize -----

# Block membership in long form: number the blocks, then stack the
# `Unit 1` / `Unit 2` columns so that each row holds one unit id together
# with its block_id and the block's Distance
block_ids <- out$blocks[["1"]] %>%
  mutate(block_id = 1:n()) %>%
  pivot_longer(cols = `Unit 1`:`Unit 2`, values_to = "id")

# Add assignments and block id to the data.
# Inside mutate() the names on the right-hand side (cue_salestax, ...)
# still refer to the assignment objects created earlier, because no column
# of that name exists yet when each expression is evaluated.
# NOTE(review): the `- 1` presumably recodes the first condition name
# ("cue" / "state") to 0 and the second ("nocue" / "wait") to 1 -- confirm
# against extract_conditions() and the Qualtrics survey logic.
df_out <- df_block %>%
  mutate(
    cue_salestax = extract_conditions(cue_salestax, df_block, "id") - 1,
    cue_pension = extract_conditions(cue_pension, df_block, "id") - 1,
    cue_fedaudit = extract_conditions(cue_fedaudit, df_block, "id") - 1,
    cue_foreignaid = extract_conditions(cue_foreignaid, df_block, "id") - 1,
    cue_healthcare = extract_conditions(cue_healthcare, df_block, "id") - 1,
    state_pref = extract_conditions(state_pref, df_block, "id") - 1,
    design = 3
  ) %>%
  plyr::join(block_ids) %>%
  # the blocking covariates are no longer needed
  select(-(age:healthcare_pre)) %>%
  # drop units that have no block Distance after the join
  filter(!is.na(Distance)) %>%
  # add all ids of people to invite back from designs 1 and 2
  bind_rows(
    df[, c("id", "design")] %>%
      filter(design %in% c(1, 2))
  ) %>%
  select(-Distance, -name)

# Save objects -----

# Treatment assignments and block ids for design 3, plus the ids to invite
# back from designs 1 and 2
write.csv(df_out, file = "data/processed_data/TappinHewitt-blockrandtreatments.csv")

# Embedded data to use in Qualtrics: a JSON array with one object per row
# of df_out, every value written as a quoted string, e.g.
# [{"WorkerID": "1234", "design": "1", ...},{"WorkerID": "5678", "design": "2", ...}]
#
# "WorkerID" is filled from the anonymous `id` column. df_out has no
# `participantId` column, so `df_out$participantId` is NULL, which paste0()
# recycles to "" and would leave every WorkerID blank.
# Rows from designs 1 and 2 carry no assignments, so their cue_* and
# state_pref values are written as the string "NA".
x <- paste0('{"WorkerID": "', df_out$id,
            '", "design": "', df_out$design,
            '", "cue_salestax": "', df_out$cue_salestax,
            '", "cue_pension": "', df_out$cue_pension,
            '", "cue_fedaudit": "', df_out$cue_fedaudit,
            '", "cue_foreignaid": "', df_out$cue_foreignaid,
            '", "cue_healthcare": "', df_out$cue_healthcare,
            '", "state_pref": "', df_out$state_pref,
            '"',
            '}')
# Join the per-row objects with commas and wrap them in array brackets
x <- paste0(x, collapse = ",")
x <- paste0('[', x, ']')
write_lines(x = x, file = "data/processed_data/TappinHewitt-embdata.txt")

